Required packages: ggthemes, fixest, broom, tidyverse
# Load required packages, installing pacman first if it is absent.
# requireNamespace() checks availability without attaching the package and
# without require()'s warning-on-failure behavior.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  ggthemes, fixest, broom, tidyverse
)
To see how DD works in practice we will first generate a fake dataset where we know the true value of what we want to estimate. This is a good skill to have in order to make sure you understand how methods work, and that your code does what you want it to do. If you know what the result is and your code produces something different, your code is wrong.
To generate fake data we need to model the true data generating process. We will make this exactly the DD model, but with some added noise.
Any time you introduce randomness into your code, you will want to set a random number generator seed so it is reproducible.
# Initialize the random number generator so every run of this notebook
# reproduces the exact same "random" draws.
set.seed(123456)
Every time you run this notebook you will get the exact same results even though we are drawing “random” numbers.
The model generating our data will be very simple: 1. A time-invariant factor specific to both NY and PA 2. A period-specific factor common across both NY and PA 3. A treatment effect in NY after the fake policy is implemented 4. Some added random noise in outcomes
First we need to define the time-invariant factors.
# Time-invariant state characteristics: NY has more biodiversity than PA.
# Built column-by-column with tibble(); equivalent to the row-wise tribble form.
state_fes <- tibble(
  state    = c("NY", "PA"),
  state_fe = c(88, 44)
)
Here we used a tribble, which is just a tibble but where we can write it out element-by-element. This is helpful when you need to initialize small data frames.
Next we need the common, period-specific factors.
# Common period-specific characteristics: biodiversity declines over time.
period_fes <- tibble(
  period    = c("before", "after"),
  period_fe = c(341, 207)
)
Last, we need to define the size of the treatment effect.
# True effect of the policy: how many additional species are saved?
# This is the parameter the DD regression below should recover.
treatment_effect <- 55
Now we have defined all the parameters required to generate our data. Next, we need to actually generate the data. This takes a few steps.
First, we need to generate N observations for each combination of before/after and NY/PA. We can do this with crossing.
# Every combination of state, period, and observation number:
# 2 states x 2 periods x 500 obs = 2,000 rows.
bio_df <- crossing(
  state  = c("NY", "PA"),
  period = c("before", "after"),
  obs    = seq_len(500)
)
bio_df
## # A tibble: 2,000 x 3
## state period obs
## <chr> <chr> <int>
## 1 NY after 1
## 2 NY after 2
## 3 NY after 3
## 4 NY after 4
## 5 NY after 5
## 6 NY after 6
## 7 NY after 7
## 8 NY after 8
## 9 NY after 9
## 10 NY after 10
## # … with 1,990 more rows
Second, we need to bring in the time-invariant and period-specific factors with join functions.
# Attach the period- and state-specific effects to every observation.
# Spelling out `by =` documents the join keys and avoids relying on
# readr-style implicit key detection (which also prints a message).
bio_df <- bio_df %>%
  inner_join(period_fes, by = "period") %>%
  inner_join(state_fes, by = "state")
bio_df
## # A tibble: 2,000 x 5
## state period obs period_fe state_fe
## <chr> <chr> <int> <dbl> <dbl>
## 1 NY after 1 207 88
## 2 NY after 2 207 88
## 3 NY after 3 207 88
## 4 NY after 4 207 88
## 5 NY after 5 207 88
## 6 NY after 6 207 88
## 7 NY after 7 207 88
## 8 NY after 8 207 88
## 9 NY after 9 207 88
## 10 NY after 10 207 88
## # … with 1,990 more rows
Third, it will be helpful to define a variable for units that are treated: New York and after.
# Flag treated units: New York observations in the post-policy period.
# Listing "before" first in `levels` already makes it the reference level,
# so the original separate relevel(period, "before") call was redundant.
bio_df <- bio_df %>%
  mutate(
    treated = state == "NY" & period == "after",
    period  = factor(period, levels = c("before", "after"))
  )
bio_df
## # A tibble: 2,000 x 6
## state period obs period_fe state_fe treated
## <chr> <fct> <int> <dbl> <dbl> <lgl>
## 1 NY after 1 207 88 TRUE
## 2 NY after 2 207 88 TRUE
## 3 NY after 3 207 88 TRUE
## 4 NY after 4 207 88 TRUE
## 5 NY after 5 207 88 TRUE
## 6 NY after 6 207 88 TRUE
## 7 NY after 7 207 88 TRUE
## 8 NY after 8 207 88 TRUE
## 9 NY after 9 207 88 TRUE
## 10 NY after 10 207 88 TRUE
## # … with 1,990 more rows
Now we need to generate our data by adding the time-invariant effect state_fe, the period-specific effect period_fe, and some random noise given by rnorm. If the observation is New York after the policy we will also add treatment_effect.
# Generate the outcome: state effect + period effect + N(0, 100^2) noise,
# plus the treatment effect for treated (NY-after) observations.
# NOTE(review): ifelse() evaluates BOTH branch expressions in full, so two
# independent rnorm(n()) vectors are drawn here; the seeded results depend
# on this exact evaluation order.
bio_df <- bio_df %>%
mutate(
biodiversity = ifelse(
treated,
period_fe + state_fe + treatment_effect + 100*rnorm(n()),
period_fe + state_fe + 100*rnorm(n())
)
)
bio_df
## # A tibble: 2,000 x 7
## state period obs period_fe state_fe treated biodiversity
## <chr> <fct> <int> <dbl> <dbl> <lgl> <dbl>
## 1 NY after 1 207 88 TRUE 433.
## 2 NY after 2 207 88 TRUE 322.
## 3 NY after 3 207 88 TRUE 314.
## 4 NY after 4 207 88 TRUE 359.
## 5 NY after 5 207 88 TRUE 575.
## 6 NY after 6 207 88 TRUE 433.
## 7 NY after 7 207 88 TRUE 481.
## 8 NY after 8 207 88 TRUE 600.
## 9 NY after 9 207 88 TRUE 467.
## 10 NY after 10 207 88 TRUE 307.
## # … with 1,990 more rows
Let’s plot the data. New York is in black, Pennsylvania is in orange.
# Plot the raw data: NY in black, PA in orange.
# Ensure "before" precedes "after" on the x-axis (use <- for assignment).
bio_df$period <- factor(bio_df$period, levels = c("before", "after"))
# The original passed group = interaction(...) to ggplot() outside aes(),
# which is silently ignored, and mapped interaction(state) of a single
# variable; mapping state directly is equivalent.
ggplot(bio_df) +
geom_jitter(aes(x = period, y = biodiversity, color = state, shape = state), size = 3) +
annotate("text", size = 8, label = "NY", x = 1, y = 700) +
annotate("text", size = 8, label = "PA", x = 1, y = 100, color = "orange") +
scale_color_colorblind() +
theme_minimal() +
scale_x_discrete(labels = c("Before Treatment", "After Treatment")) +
scale_y_continuous(limits = c(50, 700)) +
labs(
x = "Time",
y = "Biodiversity",
title = "The raw fake data"
) +
theme(
legend.position = "none",
title = element_text(size = 24),
axis.text.x = element_text(size = 24), axis.text.y = element_text(size = 24),
axis.title.x = element_text(size = 24), axis.title.y = element_text(size = 24),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
axis.line = element_line(colour = "black")
)
## Warning: Removed 13 rows containing missing values (geom_point).
Next we can estimate the effect using DD.
# Difference-in-differences regression:
# outcome ~ treatment dummy + period fixed effect + state fixed effect.
dd_model <- fixest::feols(
  biodiversity ~ treated + period + state,
  data = bio_df
)
# Tidy the fitted model into a tibble of coefficients.
estimates <- broom::tidy(dd_model)
estimates
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 436. 4.43 98.3 0.
## 2 treatedTRUE 54.7 8.87 6.17 8.14e-10
## 3 periodafter -139. 6.27 -22.2 7.41e-98
## 4 statePA -48.0 6.27 -7.65 3.04e-14
Compare the true values (left) versus the estimates (right). DD correctly recovers the true treatment effect!
# True treatment effect vs. the estimated treated coefficient
c(treatment_effect, estimates$estimate[2]) # treatment_effect
## [1] 55.00000 54.72843
# True common time change (after - before) vs. the estimated period coefficient
c(period_fes$period_fe[period_fes$period == "after"] - period_fes$period_fe[period_fes$period == "before"], estimates$estimate[3]) # period_after
## [1] -134.0000 -139.2645
# True state gap (PA - NY) vs. the estimated state coefficient
c(state_fes$state_fe[state_fes$state == "PA"] - state_fes$state_fe[state_fes$state == "NY"], estimates$estimate[4]) # statePA
## [1] -44.00000 -47.98205
Now let’s look at what happens if we take more naive approaches that do not address time-invariant factors, or common period-specific factors. We will look at how the failure to address these things biases our results in predictable ways.
Suppose we just regressed biodiversity on the existence of the conservation policy.
# Naive regression: outcome on treatment status alone, with no
# period or state fixed effects.
naive_model <- fixest::feols(
  biodiversity ~ treated,
  data = bio_df
)
broom::tidy(naive_model)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 357. 3.11 115. 0
## 2 treatedTRUE -6.13 6.23 -0.983 0.326
We underestimate the true effect because we did not difference out the common decline in biodiversity over time. The estimated effect of the policy in NY is confounded by the common decline in biodiversity across both states.
What if we controlled for these common, period-specific factors?
# Regression controlling for common period-specific factors only
# (still omits the state fixed effects).
period_model <- fixest::feols(
  biodiversity ~ treated + period,
  data = bio_df
)
broom::tidy(period_model)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 412. 3.18 129. 0.
## 2 treatedTRUE 103. 6.36 16.2 3.06e- 55
## 3 periodafter -163. 5.51 -29.6 2.47e-160
Now we overestimate the true effect. We (still) did not difference out the fact that NY tends to have higher levels of biodiversity. The estimated treatment effect is biased upward because NY is just more biodiverse even without the policy.
########################################
## Read in blood lead data
# All twelve columns parse as doubles (per the column specification readr
# prints), so declare the types explicitly instead of relying on guessing.
bll_df <- read_csv("data/hr_2021_bll.csv", col_types = cols(.default = col_double()))
## Parsed with column specification:
## cols(
## state_fips = col_double(),
## county_fips = col_double(),
## percent_high_conditional_tested = col_double(),
## ihs_bll = col_double(),
## race = col_double(),
## year = col_double(),
## weight = col_double(),
## unemp_rate = col_double(),
## median_income = col_double(),
## percent_non_white = col_double(),
## lead_emitted = col_double(),
## ap = col_double()
## )
########################################
## Inspect the loaded blood lead data
print(bll_df)
## # A tibble: 22,887 x 12
## state_fips county_fips percent_high_co… ihs_bll race year weight unemp_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0.752 0.695 0 2015 11.5 0.052
## 2 1 1 0 0 0 2014 5.39 0.059
## 3 1 1 0 0 0 2013 4.47 0.062
## 4 1 1 0 0 0 2012 3.16 0.069
## 5 1 1 0 0 0 2011 6 0.083
## 6 1 1 0 0 0 2010 13.4 0.089
## 7 1 1 1.14 0.979 0 2009 13.2 0.0970
## 8 1 1 0.543 0.520 0 2008 13.6 0.051
## 9 1 1 0.610 0.577 0 2007 12.8 0.033
## 10 1 1 4.44 2.20 0 2006 6.71 0.033
## # … with 22,877 more rows, and 4 more variables: median_income <dbl>,
## # percent_non_white <dbl>, lead_emitted <dbl>, ap <dbl>
########################################
## Plot raw lead-poisoning trends by race group, 2005-2009
bll_df %>%
group_by(race, year) %>%
# Weighted mean within each race-year cell. Spell out TRUE (never T) and
# drop the grouping explicitly to silence summarise()'s regrouping message.
summarise(percent_high_conditional_tested = weighted.mean(percent_high_conditional_tested, weight, na.rm = TRUE), .groups = "drop") %>%
filter(year >= 2005 & year <= 2009) %>%
ggplot(aes(x = year, y = percent_high_conditional_tested)) +
# Dashed line marks the leaded-to-unleaded fuel transition
geom_vline(xintercept = 2006.5, color = "grey55", linetype = "dashed") +
geom_point(aes(shape = as.factor(race), color = as.factor(race)), size = 5) +
annotate("text", size = 4, label = "Treated", x = 2005.25, y = 2.85) +
annotate("text", size = 4, label = "Border", x = 2005.25, y = 2.5) +
annotate("text", size = 4, label = "Control", x = 2005.25, y = 1.4) +
annotate("text", size = 6, label = "Leaded Fuel", x = 2005.50, y = 0.25) +
annotate("text", size = 6, label = "Unleaded Fuel", x = 2008, y = 0.25) +
theme_minimal() +
scale_color_colorblind() +
labs(
x = "Year",
y = "Percent Lead Poisoned",
title = "Average lead poisoning rates by type (balanced panel)"
) +
theme(
legend.position = "none",
title = element_text(size = 14),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
axis.line = element_line(colour = "black")
)
## `summarise()` regrouping output by 'race' (override with `.groups` argument)
########################################
## Difference-in-differences estimate
# Race-group indicators interacted with a post-2007 (unleaded fuel) dummy,
# with county and year fixed effects; observations weighted, standard
# errors clustered by state. The main post-2007 dummy is absorbed by the
# year fixed effects, so fixest drops it for collinearity (see note below).
fixest::feols(ihs_bll ~ as.factor(race)*I(year > 2007) | county_fips + year,
weights = ~weight,
data = bll_df) %>%
broom::tidy(cluster = "state_fips")
## The variable 'I(year > 2007)TRUE' has been removed because of collinearity (see $collin.var).
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 as.factor(race)1 0.0660 0.100 0.658 0.511
## 2 as.factor(race)2 0.252 0.132 1.91 0.0567
## 3 as.factor(race)1:I(year > 2007)TRUE -0.0495 0.0597 -0.829 0.407
## 4 as.factor(race)2:I(year > 2007)TRUE -0.227 0.0907 -2.50 0.0124
########################################
## Event study
# year must be an (unordered) factor so we can omit 2007 -- the last
# leaded-fuel year -- as the reference level.
bll_df$year <- factor(bll_df$year, ordered = FALSE)
# Name the reference level explicitly; the original positional index
# (relevel(year, 3)) silently breaks if the sample years change.
bll_df$year <- relevel(bll_df$year, ref = "2007")
# Estimate event study: race-by-year effects with county and year fixed
# effects plus state-specific year shocks; cluster by state.
event_study <- fixest::feols(ihs_bll ~ as.factor(race)*year + as.factor(state_fips)*year | county_fips + year,
weights = ~weight,
data = bll_df) %>%
# broom::tidy for consistency with the other model calls in this file
broom::tidy(cluster = "state_fips") %>%
# Keep only the race==2 by year interaction coefficients
filter(str_detect(term, "^as.factor\\(race\\)2:year")) %>%
# Re-insert the omitted 2007 reference year as a zero (3rd position:
# sample years run 2005-2015)
add_row(estimate = 0, std.error = 0, .after = 2) %>%
mutate(year = row_number() + 2004)
## Variables 'year2005', 'year2006' and 62 others have been removed because of collinearity (see $collin.var).
# Plot event study: point estimates with a 95% confidence ribbon,
# all relative to the omitted 2007 reference year.
ggplot(data = event_study, aes(x = year, y = estimate)) +
geom_hline(yintercept = 0, size = 0.5) +
# Dashed line marks the leaded-to-unleaded fuel transition
geom_vline(xintercept = 2006.5, color = "grey55", linetype = "dashed") +
# 95% confidence band: estimate +/- 1.96 standard errors
geom_ribbon(aes(ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error),
fill = "darkslateblue", alpha = 0.4) +
geom_line(size = 1) +
geom_point(size = 5) +
annotate("text", size = 4, label = "Leaded Fuel", x = 2005.50, y = -0.26) +
annotate("text", size = 4, label = "Unleaded Fuel", x = 2012, y = -0.26) +
theme_minimal() +
scale_color_colorblind() +
scale_x_continuous(breaks = seq(2005, 2015, 2)) +
labs(
x = "Year",
y = "Percent Lead Poisoned",
title = "Percent lead poisoned relative to 2007 (first unleaded year)"
) +
theme(
legend.position = "none",
title = element_text(size = 14),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
axis.line = element_line(colour = "black")
)
########################################
## Read in age-adjusted mortality data (the original comment incorrectly
## said "blood lead data")
mortality_df <- read_csv("data/hr_2021_mortality.csv")
## Parsed with column specification:
## cols(
##   state_fips = col_double(),
##   county_fips = col_double(),
##   year = col_double(),
##   race = col_double(),
##   age_adjusted_rate = col_double(),
##   weight = col_double()
## )
mortality_df
## # A tibble: 58,107 x 6
## state_fips county_fips year race age_adjusted_rate weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 2017 0 4856. 0
## 2 1 1 2016 0 4967. 235.
## 3 1 1 2015 0 5118. 235.
## 4 1 1 2014 0 4651. 234.
## 5 1 1 2013 0 5460 234.
## 6 1 1 2012 0 5863. 235.
## 7 1 1 2011 0 4920. 235.
## 8 1 1 2010 0 5154. 234.
## 9 1 1 2009 0 4932. 233.
## 10 1 1 2008 0 5705. 231.
## # … with 58,097 more rows
# year must be an (unordered) factor so we can omit 2007 as the
# reference level.
mortality_df$year <- factor(mortality_df$year, ordered = FALSE)
# Name the reference level explicitly; the original positional index
# (relevel(year, 9)) silently breaks if the sample years change.
mortality_df$year <- relevel(mortality_df$year, ref = "2007")
# Estimate event study: race-by-year effects with county and year fixed
# effects plus state-specific year shocks; cluster by state.
event_study <- fixest::feols(age_adjusted_rate ~ as.factor(race)*year + as.factor(state_fips)*year | county_fips + year,
weights = ~weight,
data = mortality_df) %>%
broom::tidy(cluster = "state_fips") %>%
# Keep only the race==2 by year interaction coefficients
filter(str_detect(term, "^as.factor\\(race\\)2:year")) %>%
# Re-insert the omitted 2007 reference year as a zero (9th position:
# sample years run 1999-2017 here)
add_row(estimate = 0, std.error = 0, .after = 8) %>%
mutate(year = row_number() + 1998)
## NOTES: 1,870 observations removed because of 0-weight.
## 2,212 observations removed because of NA values (Breakup: LHS: 2,212, RHS: 0, Fixed-effects: 0, Weights: 0).
## Variables 'year1999', 'year2000' and 15 others have been removed because of collinearity (see $collin.var).
# Plot event study: mortality-rate point estimates with a 95% confidence
# ribbon, all relative to the omitted 2007 reference year.
ggplot(data = event_study, aes(x = year, y = estimate)) +
geom_hline(yintercept = 0, size = 0.5) +
# Dashed line marks the leaded-to-unleaded fuel transition
geom_vline(xintercept = 2006.5, color = "grey55", linetype = "dashed") +
# 95% confidence band: estimate +/- 1.96 standard errors
geom_ribbon(aes(ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error),
fill = "darkslateblue", alpha = 0.4) +
geom_line(size = 1) +
geom_point(size = 5) +
annotate("text", size = 4, label = "Leaded Fuel", x = 2002, y = -200) +
annotate("text", size = 4, label = "Unleaded Fuel", x = 2012, y = -200) +
theme_minimal() +
scale_color_colorblind() +
scale_x_continuous(breaks = seq(1999, 2015, 2)) +
labs(
x = "Year",
y = "Age-Adjusted Mortality Rate (deaths per 100,000)",
title = "Mortality rate relative to 2007 (first unleaded year)"
) +
theme(
legend.position = "none",
title = element_text(size = 14),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
axis.line = element_line(colour = "black")
)